The simulation, technically, would be established on a continuous space-time domain, even though the final “observations” would be a discrete realization.
Take a Gaussian process for example:
\[Y_i(\mathbf{s}, t) = \eta_i(\mathbf{s}, t) +\epsilon_i(\mathbf{s}, t)\]
2. The true latent process is composed of a fixed (population-level) process and a random (subject-specific) process.
\[\eta_i(\mathbf{s}, t) = \mu(\mathbf{s}, t)+b_i(\mathbf{s}, t)\]
\[\epsilon_i(\mathbf{s}, t) = \frac{1}{N_r}\sum_{\mathbf{s'} \in S_r}Z(\mathbf{s'}, t)\]
where:
\[Y'_i(\mathbf{s},t) = \beta_i(t)Y_i(\mathbf{s},t) + b'_i(\mathbf{s},t) + \epsilon_i(\mathbf{s}, t)\]
To start, let’s generate data from the following scheme:
\[\begin{aligned} Y_{1i}(\mathbf{s}, t) &= \phi_1(\mathbf{s})\xi_{i1}+t\xi_{i2}+\epsilon_{1i}(\mathbf{s}, t), \ \xi_{ik} \sim N(0, \sigma^2_{k}) \\ \phi_1(\mathbf{s}) &= \sqrt{s_1^2+s_2^2}\\ Y_{2i}(\mathbf{s},t) &= 2tY_{1i}(\mathbf{s},t)+\epsilon_{2i}(\mathbf{s}, t)\\ \end{aligned}\]
In the real dataset, a lot of things would be different for each lesion (e.g., the correlation, size of image, center of image, correlation between two outcomes). However, the current set up seems to imply constant correlation across subjects.
# set up space-time grid
# generate a 2D image of 256 by 256
sid1 <- sid2 <- 1:256
nS <- 256
# standardized pixel coordinates plus each pixel's distance to the image center
df_grid <- expand_grid(sid1, sid2) %>%
  mutate(
    s1 = as.vector(scale(sid1)),
    s2 = as.vector(scale(sid2)),
    dist = sqrt(s1^2 + s2^2)
  )
## we would need distance to center for the random effect of Y1
# times of scan: 5 equally spaced scan times in (0, 1]
t <- seq(from = 0.2, to = 1, by = 0.2)
nT <- length(t)
# cross the spatial grid with the time points
df_grid <- expand_grid(df_grid, t = t)
## 256^2 * nT = 327,680 observations for each subject
# visualize distance to center across the grid (constant over time)
df_grid %>%
  ggplot(aes(x = s1, y = s2, fill = dist)) +
  geom_tile() +
  facet_wrap(~t)
N <- 100 # sample size
K1 <- 2  # number of coefficients in the random effect of Y1
# subject-level scores xi_i ~ N(0, I), one row per subject
xi_mat <- mvtnorm::rmvnorm(N, mean = rep(0, K1), sigma = diag(K1))
# container: one row per (subject, pixel, time)
df_subj <- expand_grid(id = 1:N, df_grid)
df_subj$Y2 <- NA
df_subj$Y1 <- NA
## N * 256^2 * nT = 32,768,000 observations in total
# kernel size for moving average
ma_size <- 15
pb <- txtProgressBar(min = 0, max = N, style = 3)
t1 <- Sys.time()
for (i in seq_len(N)) { # fix a subject
  xi_i <- xi_mat[i, ]
  for (this_t in t) { # fix a time point
    # locate this subject/time slice ONCE; the original recomputed this
    # 3.2M-row logical mask four times per iteration, dominating run time
    idx_it <- which(df_subj$id == i & df_subj$t == this_t)
    dist_it <- df_subj$dist[idx_it]
    # generate Y1: random effects + spatially correlated moving-average error
    Zmat_it <- matrix(rnorm(nS^2, 0, 5), nS, nS)
    MA_err_it <- as.vector(MA_rand_field(ma_size, Zmat_it)) # flatten by column
    Y1_it <- dist_it * xi_i[1] + this_t * xi_i[2] + MA_err_it
    df_subj$Y1[idx_it] <- Y1_it
    # generate Y2: time-varying slope on Y1 + iid N(0, 1) error
    df_subj$Y2[idx_it] <- 2 * this_t * Y1_it + rnorm(length(Y1_it))
  }
  setTxtProgressBar(pb, i)
}
t2 <- Sys.time()
close(pb)
It took 6.814 minutes to generate data for 100 subjects.
Below is an example of one subject:
# one subject's Y1 images over time
df_subj %>%
  filter(id == 25) %>%
  ggplot(aes(x = s1, y = s2, fill = Y1)) +
  geom_tile() +
  facet_wrap(~t, nrow = 1) +
  labs(title = "Y1")
# the same subject's Y2 images over time
df_subj %>%
  filter(id == 25) %>%
  ggplot(aes(x = s1, y = s2, fill = Y2)) +
  geom_tile() +
  facet_wrap(~t, nrow = 1) +
  labs(title = "Y2")
# pixel-wise Y1 vs Y2 with fitted line and Pearson correlation, by time
df_subj %>%
  filter(id == 25) %>%
  ggplot(aes(x = Y1, y = Y2)) +
  geom_point(size = 0.2) +
  geom_smooth(formula = "y~x", method = "lm") +
  stat_cor(method = "pearson") +
  facet_wrap(~t, nrow = 1)
Below I’d like to calculate the slope from the linear regression of \(Y_{2i}\) on \(Y_{1i}\), get the 95% confidence interval, and check its coverage rate (how many times the confidence interval covers the true slope):
\[Y_{2i}(\mathbf{s}|t) = \beta_{0ti}+\beta_{1ti}Y_{1i}(\mathbf{s}|t)+\epsilon_i(\mathbf{s}|t)\]
get_beta_ci <- function(x){
  # OLS regression of Y2 on Y1 within one subject-time slice.
  # Returns a one-row data frame: slope estimate and its 95% Wald CI.
  fit_ols <- lm(Y2 ~ Y1, data = x)
  slope <- coef(fit_ols)["Y1"]
  bounds <- confint(fit_ols)["Y1", ]
  data.frame(beta1 = slope, cil = bounds[1], ciu = bounds[2])
}
# per-(subject, time) OLS slope CIs, then empirical coverage of beta1 = 2t
df_subj %>%
  group_by(id, t) %>%
  group_modify(~ get_beta_ci(.x)) %>%
  mutate(
    true_beta1 = 2 * t,
    cover = cil <= true_beta1 & ciu >= true_beta1
  ) %>%
  group_by(t) %>%
  summarise(cov_rate = mean(cover))
## # A tibble: 5 × 2
## t cov_rate
## <dbl> <dbl>
## 1 0.2 0.9
## 2 0.4 0.97
## 3 0.6 0.96
## 4 0.8 0.96
## 5 1 0.95
# per-subject CIs (bars), estimates (black), and truth 2t (red), by time
df_subj %>%
  group_by(id, t) %>%
  group_modify(~ get_beta_ci(.x)) %>%
  mutate(true_beta1 = 2 * t) %>%
  ggplot() +
  coord_flip() +
  geom_errorbar(aes(x = id, ymin = cil, ymax = ciu)) +
  geom_point(aes(x = id, y = beta1), size = 0.2) +
  geom_point(aes(x = id, y = true_beta1), col = "red", size = 0.2) +
  facet_wrap(~t, nrow = 1, scales = "free_x")
In the previous sample, spatial correlation of \(Y_2\) is much weaker than \(Y_1\). What if we introduce a spatial MA error in \(Y_2\) as well?
\[\begin{aligned} Y_{1i}(\mathbf{s}, t) &= \phi(\mathbf{s})\xi_{i1}+t\xi_{i2}+\epsilon_{1i}(\mathbf{s}, t), \ \xi_{ik} \sim N(0, \sigma^2_{k}) \\ \phi_1(\mathbf{s}) &= \sqrt{s_1^2+s_2^2}\\ Y_{2i}(\mathbf{s},t) &= 2tY_{1i}(\mathbf{s},t)+\epsilon_{2i}(\mathbf{s}, t)\\ \end{aligned}\]
# container for the second scheme: same grid as df_subj, with the old
# outcomes dropped and Y1/Y2 re-created as NA placeholders to be filled
df_subj2 <- df_subj %>%
select(-Y1, -Y2) %>%
mutate(Y1=NA, Y2=NA)
pb <- txtProgressBar(min = 0, max = N, style = 3)
t1 <- Sys.time()
for (i in seq_len(N)) { # fix a subject
  xi_i <- xi_mat[i, ]
  for (this_t in t) { # fix a time point
    # locate this subject/time slice ONCE instead of recomputing the
    # 3.2M-row mask for every read and write
    idx_it <- which(df_subj2$id == i & df_subj2$t == this_t)
    # fix: read dist from df_subj2 (the frame being filled), not df_subj;
    # the values are identical, but referencing the stale frame was fragile
    dist_it <- df_subj2$dist[idx_it]
    # generate Y1: random effects + spatially correlated MA error
    Zmat_it1 <- matrix(rnorm(nS^2, 0, 5), nS, nS)
    MA_err_it1 <- as.vector(MA_rand_field(ma_size, Zmat_it1)) # flatten by column
    Y1_it <- dist_it * xi_i[1] + this_t * xi_i[2] + MA_err_it1
    df_subj2$Y1[idx_it] <- Y1_it
    # generate Y2: time-varying slope on Y1 + its own spatial MA error
    Zmat_it2 <- matrix(rnorm(nS^2, 0, 5), nS, nS)
    MA_err_it2 <- as.vector(MA_rand_field(ma_size, Zmat_it2)) # flatten by column
    df_subj2$Y2[idx_it] <- 2 * this_t * Y1_it + MA_err_it2
  }
  setTxtProgressBar(pb, i)
}
t2 <- Sys.time()
close(pb)
# one subject's Y1 images over time (scheme 2)
df_subj2 %>%
  filter(id == 25) %>%
  ggplot(aes(x = s1, y = s2, fill = Y1)) +
  geom_tile() +
  facet_wrap(~t, nrow = 1) +
  labs(title = "Y1")
# the same subject's Y2 images over time
df_subj2 %>%
  filter(id == 25) %>%
  ggplot(aes(x = s1, y = s2, fill = Y2)) +
  geom_tile() +
  facet_wrap(~t, nrow = 1) +
  labs(title = "Y2")
# pixel-wise Y1 vs Y2 with fitted line and Pearson correlation, by time
df_subj2 %>%
  filter(id == 25) %>%
  ggplot(aes(x = Y1, y = Y2)) +
  geom_point(size = 0.2) +
  geom_smooth(formula = "y~x", method = "lm") +
  stat_cor(method = "pearson") +
  facet_wrap(~t, nrow = 1)
# OLS coverage under scheme 2: spatially correlated errors in Y2 break
# the iid assumption, so expect severe undercoverage
df_subj2 %>%
  group_by(id, t) %>%
  group_modify(~ get_beta_ci(.x)) %>%
  mutate(
    true_beta1 = 2 * t,
    cover = cil <= true_beta1 & ciu >= true_beta1
  ) %>%
  group_by(t) %>%
  summarise(cov_rate = mean(cover))
## # A tibble: 5 × 2
## t cov_rate
## <dbl> <dbl>
## 1 0.2 0.11
## 2 0.4 0.06
## 3 0.6 0.13
## 4 0.8 0.12
## 5 1 0.11
# per-subject CIs (bars), estimates (black), and truth 2t (red), by time
df_subj2 %>%
  group_by(id, t) %>%
  group_modify(~ get_beta_ci(.x)) %>%
  mutate(true_beta1 = 2 * t) %>%
  ggplot() +
  coord_flip() +
  geom_errorbar(aes(x = id, ymin = cil, ymax = ciu)) +
  geom_point(aes(x = id, y = beta1), size = 0.2) +
  geom_point(aes(x = id, y = true_beta1), col = "red", size = 0.2) +
  facet_wrap(~t, nrow = 1, scales = "free_x")
# Attempt GLS with a Gaussian spatial correlation on one full 256 x 256
# slice. This intentionally fails: nlme::gls builds an n-by-n correlation
# structure, and n = 65,536 points is beyond what it can handle (error below).
df_subj2 %>% filter(id==1 & t==0.2) %>%
gls(Y2~Y1, data = .,
correlation = corGaus(value=0.2, ~s1+s2))
## Error: 'sumLenSq := sum(table(groups)^2)' = 4.29497e+09 is too large.
## Too large or no groups in your correlation structure?
Because the original data size is too large for nlme::gls to handle, I have reduced the spatial grid to 32 by 32 (keeping every 8th point — the 8th, 16th, 24th, … — along both axes).
# reduce grid: keep every 8th pixel index along each axis (32 x 32 grid);
# seq starts at 8 because seq(0, ...) would include 0, which is not a
# valid index in sid1/sid2 (1:256) and never matched anything
reduce_sid <- seq(8, 256, by = 8)
df_subj2 <- df_subj2 %>% filter(sid1 %in% reduce_sid & sid2 %in% reduce_sid)
beta_ci_gls <- function(x){
  # GLS regression of Y2 on Y1 with a Gaussian spatial correlation in (s1, s2).
  # Returns a one-row data frame: slope estimate and its 95% Wald CI.
  fit <- gls(Y2 ~ Y1, data = x, correlation = corGaus(value = 0.2, ~s1+s2))
  slope <- coef(fit)["Y1"]
  bounds <- confint(fit)["Y1", ]
  data.frame(beta1 = slope, cil = bounds[1], ciu = bounds[2])
}
t1 <- Sys.time()
# GLS coverage on the reduced grid: modeling the spatial correlation
# should restore close-to-nominal 95% coverage
df_subj2 %>%
  group_by(id, t) %>%
  group_modify(~ beta_ci_gls(.x)) %>%
  mutate(
    true_beta1 = 2 * t,
    cover = cil <= true_beta1 & ciu >= true_beta1
  ) %>%
  group_by(t) %>%
  summarise(cov_rate = mean(cover))
## # A tibble: 5 × 2
## t cov_rate
## <dbl> <dbl>
## 1 0.2 0.98
## 2 0.4 0.96
## 3 0.6 0.91
## 4 0.8 0.94
## 5 1 0.97
t2 <- Sys.time() # end of the GLS timing started at t1 above
The GLS process for all subjects at all time points on reduced data took 6.805 hours.
Compare this coverage rate with regular LM coverage rate on reduced data:
# baseline comparison: plain OLS coverage on the same reduced data
df_subj2 %>%
  group_by(id, t) %>%
  group_modify(~ get_beta_ci(.x)) %>%
  mutate(
    true_beta1 = 2 * t,
    cover = cil <= true_beta1 & ciu >= true_beta1
  ) %>%
  group_by(t) %>%
  summarise(cov_rate = mean(cover))
## # A tibble: 5 × 2
## t cov_rate
## <dbl> <dbl>
## 1 0.2 0.78
## 2 0.4 0.82
## 3 0.6 0.69
## 4 0.8 0.82
## 5 1 0.73